install.packages("e1071",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'e1071' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'e1071'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\e1071\libs\x64\e1071.dll to D:
## \Documents\R\win-library\3.6\e1071\libs\x64\e1071.dll: Permission denied
## Warning: restored 'e1071'
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("ggplot2",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("corrplot",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'corrplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("ggcorrplot",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'ggcorrplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("klaR",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'klaR' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("cluster",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'cluster' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'cluster'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\cluster\libs\x64\cluster.dll to D:
## \Documents\R\win-library\3.6\cluster\libs\x64\cluster.dll: Permission
## denied
## Warning: restored 'cluster'
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("fpc",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'fpc' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("class",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'class' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'class'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\class\libs\x64\class.dll to D:
## \Documents\R\win-library\3.6\class\libs\x64\class.dll: Permission denied
## Warning: restored 'class'
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("rpart",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'rpart' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'rpart'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\rpart\libs\x64\rpart.dll to D:
## \Documents\R\win-library\3.6\rpart\libs\x64\rpart.dll: Permission denied
## Warning: restored 'rpart'
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("cowplot",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'cowplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("randomForest",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'randomForest' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'randomForest'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying D:
## \Documents\R\win-library\3.6\00LOCK\randomForest\libs\x64\randomForest.dll
## to D:\Documents\R\win-library\3.6\randomForest\libs\x64\randomForest.dll:
## Permission denied
## Warning: restored 'randomForest'
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("rpart.plot",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'rpart.plot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("tree",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'tree' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'tree'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\tree\libs\x64\tree.dll to D:
## \Documents\R\win-library\3.6\tree\libs\x64\tree.dll: Permission denied
## Warning: restored 'tree'
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
install.packages("glmnet",repos = "http://cran.us.r-project.org")
## Installing package into 'D:/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'glmnet' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'glmnet'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## D:\Documents\R\win-library\3.6\00LOCK\glmnet\libs\x64\glmnet.dll to D:
## \Documents\R\win-library\3.6\glmnet\libs\x64\glmnet.dll: Permission denied
## Warning: restored 'glmnet'
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\RtmpicG6vJ\downloaded_packages
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-18
library("tree")
library("rpart.plot")
## Loading required package: rpart
library("cowplot")
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
library("randomForest")
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library("rpart")
library("class")
library("fpc")
library("cluster")
library("plyr")
library("klaR")
## Loading required package: MASS
library("ggplot2")
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library("e1071")
library("corrplot")
## corrplot 0.84 loaded
library("ggcorrplot")
MAE <- function(actual, predicted){
mean(abs(actual- predicted))
}
RMSE <- function(actual, predicted){
sqrt(mean((predicted-actual)^2))
}
In this part we shall mostly look to clean our data, which includes: address missing/duplicate values, look for outliers, correct data types , fixing categorical variables, Distribution of variables, Low variance filter.
raw_data = read.csv2(file='C:\\Users\\Abdullah\\Desktop\\housePrices\\train.csv', header = T, sep = ",", dec = ".",stringsAsFactors = FALSE)
modified_data = raw_data ## To ensure we do not touch original data, we replicate into another df and use that as modified version
str(raw_data)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
head(raw_data)
As we can see, we have 81 variables in our data set; 1 ID and 1 salesPrice, 43 categorical and 36 quantitative. Data entries are either int or char data type.
summary(raw_data$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
typeof(raw_data$SalePrice)
## [1] "integer"
myhist <- hist(raw_data$SalePrice)
multiplier <- myhist$counts / myhist$density
mydensity <- density(raw_data$SalePrice)
mydensity$y <- mydensity$y * multiplier[1]
plot(myhist, xlab = "Sales Price", main = "Histogram of Sales")
lines(mydensity)
We can see that the average sales price of a house is $181k, the distribution of which is skewed to the right. As we can see from the histogram, the right tail is longer and the mass of the concentration of data is to the left of the graph, suggesting a positive skew value. The peak also looks very sharp, assuming a high kurtosis value.
boxplot(raw_data$SalePrice)
We can see that there are many outlier in the SalePrice. In fact, we can measure knowing that outliers are classified as values over 3rd Quartile + 1.5*(IQR)
count = 0;
x = 1;
while (x < 1461) {
if (raw_data$SalePrice[x] > 340000){
count = count + 1
}
x = x + 1
}
cat("Number of Sales that lie as outliers are: ", count)
## Number of Sales that lie as outliers are: 61
Now that we’ve seen that there are 61 outliers in the Sale Price, lets look into them.
outlier_sales <- subset(raw_data, raw_data$SalePrice > 340000)
outlier_sales
As we can see, there are missing values for many of the entries. Perhaps it is better that we look to address missing values in our other 80 variables.
# Counting number of nulls in each col
x = 1
cat("---NULL COUNT---\n")
## ---NULL COUNT---
while (x<81){
if(sum(is.na((raw_data[x]))>0)){
cat("Number of nulls in ",(colnames(raw_data[x])), ": ")
cat(sum(is.na(raw_data[x])), "\n")
}
x = x + 1
}
## Number of nulls in LotFrontage : 259
## Number of nulls in Alley : 1369
## Number of nulls in MasVnrType : 8
## Number of nulls in MasVnrArea : 8
## Number of nulls in BsmtQual : 37
## Number of nulls in BsmtCond : 37
## Number of nulls in BsmtExposure : 38
## Number of nulls in BsmtFinType1 : 37
## Number of nulls in BsmtFinType2 : 38
## Number of nulls in Electrical : 1
## Number of nulls in FireplaceQu : 690
## Number of nulls in GarageType : 81
## Number of nulls in GarageYrBlt : 81
## Number of nulls in GarageFinish : 81
## Number of nulls in GarageQual : 81
## Number of nulls in GarageCond : 81
## Number of nulls in PoolQC : 1453
## Number of nulls in Fence : 1179
## Number of nulls in MiscFeature : 1406
There are 19 variables with NA values. It is important to note that this might mean that the observation is missing or perhaps a NA means somthing itself. We must consult the data dictionary.
LotFrontage has 259 NA values. This is integer value repersenting Linear feet of street connected to property. Looking into the dataset, we see that the other observations have a value anywhere from 21-313. We can safetly assume that these 259 entries repersent a house that has 0 linear feet of street being connected to the property. Of course it could be a case of missin value, but it is possible that the homes do not have the property connecting to a steet, we take this assumptions and instead change the NAs to 0. As such, We will not be removing such observations
v = 1
while (v<1461){
if (is.na(modified_data$LotFrontage[v])){
modified_data$LotFrontage[v] = 0
}
v = v + 1
}
Alley has 1369 NAs, this high number suggests that the NAs must mean somthing rather than missing value. The dictionary shows us the NA repersents no alley Access. Rather than Na, lets change that to ‘None’ a bit more repersentative.
v = 1
while (v<1461){
if (is.na(modified_data$Alley[v])){
modified_data$Alley[v] = "None"
}
v = v + 1
}
MasVnrType and MasVnrArea both have 8 missing values. This is very suspeious. Could it be that they are of the same observations and due to another house feature? Let us check. Maybe our data dictionary can help us with this.
MasVnr <- subset(raw_data, is.na(raw_data$MasVnrType))
MasVnr$Area <- subset(raw_data, is.na(raw_data$MasVnrArea))
MasVnr
These both variables are Na in the same observation. What is weird is that MasVnrType has a None category, so that means that it isn’t a case that there was no masonry veneer, rather I would say that the Masonry data was not collected for these 8 observations. This would tell me that it might be good to remove the observations
modified_data <- subset(modified_data, !is.na(raw_data$MasVnrType)) # we only do it for when one attribute isnt na, because the other overlaps.
Next lets look at BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2. All these have 37-38 missing values. Lets see if they are overlapping observations again, and see if we can figure out a pattern.
Bsmt <- subset(raw_data, is.na(raw_data$BsmtExposure))
Bsmt
Here we can see that they all overlap and there is a pattern here. They are all basement, perhaps there is somthing odd about the basement. Looking in the data dictionary confirms our suspicions, NA repersents No basements for all of there variables. These are not missing values, and NA is a very meaningful entry. I don’t like the use of NA, I would rather use a more descriptive categorical name: NoB
v = 1
while (v<1453){
if (is.na(modified_data$BsmtQual[v])){
modified_data$BsmtQual[v] = "NoB"
}
if (is.na(modified_data$BsmtCond[v])){
modified_data$BsmtCond[v] = "NoB"
}
if (is.na(modified_data$BsmtExposure[v])){
modified_data$BsmtExposure[v] = "NoB"
}
if (is.na(modified_data$BsmtFinType1[v])){
modified_data$BsmtFinType1[v] = "NoB"
}
if (is.na(modified_data$BsmtFinType2[v])){
modified_data$BsmtFinType2[v] = "NoB"
}
v = v + 1
}
Bsmt1 <- subset(modified_data, modified_data$BsmtExposure == "NoB")
Bsmt1
Bsmt2 <- subset(modified_data, modified_data$BsmtFinType2 == "NoB")
Bsmt2
It’s important to note that observation 949 has a basement that is unfinished and the Exposure is set to NA. This is a potential missing value, only because we know that the other categorical variables label this observation as UNFINSHED rather than NO BASEMENT. The exposture could have been set at No exposure, but rather was set to a level repersenting no basement. I think it’s safe to remove this data observation.
Also observation 333 has a signular basement with FinType2 as NA. FinType1 does have a value, however, and after looking through other observations, this stikes as very odd. FinType repersents the finishing of the basement, FinType1 repersents the first layer, and FinType2 repersents any additional layers (if there are any), however in the event there is one layer, other entries would have FinType2 as UNF or No Basement. I think it is safe to eliminate this observation.
modified_data <- modified_data[-c(949),]
modified_data <- modified_data[-c(333),]
Poof! The two records are gone.
For the misisng electrical data, we will be removing it, because NA has no meaning behind it and there needs to be an option for it.
modified_data <- subset(modified_data, !is.na(modified_data$Electrical))
Poof! It’s gone!
For FireplaceQu, I looked ahead at the data dictionary, and it clearly states all NAs means no Fireplace, so we can attribute this to a better categorical variable: NoF
v = 1
while (v<1450){
if (is.na(modified_data$FireplaceQu[v])){
modified_data$FireplaceQu[v] = "NoF"
}
v = v + 1
}
And now again we have 5 variables that describe the same part of the house, the garage (GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond), and they have equal amounts of NA (81). And to no suprise, NA for each of those variables means No garage. We’ll chnage this to NoG instead.
v = 1
while (v<1450){
if (is.na(modified_data$GarageType[v])){
modified_data$GarageType[v] = "NoG"
}
if (is.na(modified_data$GarageYrBlt[v])){
modified_data$GarageYrBlt[v] = "NoG"
}
if (is.na(modified_data$GarageFinish[v])){
modified_data$GarageFinish[v] = "NoG"
}
if (is.na(modified_data$GarageQual[v])){
modified_data$GarageQual[v] = "NoG"
}
if (is.na(modified_data$GarageCond[v])){
modified_data$GarageCond[v] = "NoG"
}
v = v + 1
}
I found it that the next three attributes had a lot of NA entires. So I looked into the data dictionary, they all repersent the missing item for the attribute. They are not missing values, so will not be excluded, but given better names. NA for PoolQc will be chnaged to NoP, NA for Fence will be NoF, and NA for MiscFeature will become NoM.
v = 1
while (v<1450){
if (is.na(modified_data$PoolQC[v])){
modified_data$PoolQC[v] = "NoP"
}
if (is.na(modified_data$Fence[v])){
modified_data$Fence[v] = "NoF"
}
if (is.na(modified_data$MiscFeature[v])){
modified_data$MiscFeature[v] = "NoM"
}
v = v + 1
}
We should be done will addressing missing values, lets check!
x = 1
cat("---NULL COUNT---\n")
## ---NULL COUNT---
while (x<81){
if(sum(is.na((modified_data[x]))>0)){
cat("Number of nulls in ",(colnames(modified_data[x])), ": ")
cat(sum(is.na(modified_data[x])), "\n")
}
x = x + 1
}
outlier_sales <- subset(modified_data, modified_data$SalePrice > 340000)
outlier_sales[order(outlier_sales$SalePrice),]
modified_data$MSSubClass <- formatC(modified_data$MSSubClass)
modified_data$BsmtFullBath <- formatC(modified_data$BsmtFullBath)
modified_data$BsmtHalfBath <- formatC(modified_data$BsmtHalfBath)
modified_data$FullBath <- formatC(modified_data$FullBath)
modified_data$HalfBath <- formatC(modified_data$HalfBath)
modified_data$BedroomAbvGr <- formatC(modified_data$BedroomAbvGr)
modified_data$KitchenAbvGr <- formatC(modified_data$KitchenAbvGr)
modified_data$TotRmsAbvGrd <- formatC(modified_data$TotRmsAbvGrd)
modified_data$Fireplaces <- formatC(modified_data$Fireplaces)
modified_data$MoSold <- formatC(modified_data$MoSold)
modified_data$YrSold <- formatC(modified_data$YrSold)
modified_data$OverallQual <- formatC(modified_data$OverallQual)
modified_data$OverallCond <- formatC(modified_data$OverallCond)
# I will be using temp as an alt to the actual modified dataset until everything is finalized.
colname <- colnames(modified_data)
temp = modified_data
#remeber which attributes to remove
toremove = as.vector(0)
outers= as.vector(0)
gg = 0
attrib = 0
global = 0
This will be a strainious process, but I will be going thru each variable. we shall see if there is a need to reduce the levels for the categorical ones, and if outliers need to be dealt with for the numerical ones.
attrib = attrib + 1
colname[attrib]
## [1] "Id"
if (typeof(temp[,attrib]) == 'integer'){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
## [1] 0
## numeric(0)
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# global = global + 1
# toremove[global] = attrib
}
Obviously there is no outlier problem with our ID variables. Nothing to be removed.
attrib = attrib + 1
colname[attrib]
## [1] "MSSubClass"
if (typeof(temp[,attrib]) == 'integer'){
graph = boxplot(temp[,attrib])
length(graph$out)
graph$out
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# Reassigning levels
temp$MSSubClass <- factor(temp$MSSubClass)
levels(temp$MSSubClass) <- list(One_Story=c("20","30","40","45", "50"), Two_Story=c("60","70","75"), SplitDuplex=c("80","85","90"), PUD=c("120","160","180","190"))
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# global = global + 1
# toremove[global] = attrib
}
## Var1 Freq
## 1 120 85
## 2 160 63
## 3 180 10
## 4 190 30
## 5 20 532
## 6 30 69
## 7 40 4
## 8 45 12
## 9 50 144
## 10 60 296
## 11 70 60
## 12 75 16
## 13 80 57
## 14 85 19
## 15 90 52
## Var1 Freq
## 1 One_Story 761
## 2 Two_Story 372
## 3 SplitDuplex 128
## 4 PUD 188
Reduced the number of factors
attrib = attrib + 1
colname[attrib]
## [1] "MSZoning"
if (typeof(temp[,attrib]) == 'integer'){
graph = boxplot(temp[,attrib])
length(graph$out)
graph$out
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 C (all) 10
## 2 FV 62
## 3 RH 16
## 4 RL 1144
## 5 RM 217
I decided that the number of categorical factors are appropriate, and do not need to reduce. However Judging by the variance, Probably will be looking to remove this one. The distribution is too skewed. Variance looks too little.
attrib = attrib + 1
colname[attrib]
## [1] "LotFrontage"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1],)
temp[outlierID,]
outlier_lotFrontage = subset(temp, temp[,attrib]>graph$stats[5])
gg = gg+1
outers[gg] = attrib
}
## [1] 16
## [1] 141 174 174 140 150 137 144 149 313 168 182 138 160 152 313 153
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# global = global + 1
# toremove[global] = attrib
}
I flagged all the potential outliers into the var outlier. WIll not remove yet because i might just be removing this attribute all together.
attrib = attrib + 1
colname[attrib]
## [1] "LotArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1],)
temp[outlierID,]
outlier_lotArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 67
## [1] 50271 19900 21000 21453 19378 31770 22950 25419 159000 19296
## [11] 39104 19138 18386 215245 164660 20431 18800 53107 34650 22420
## [21] 21750 70761 53227 40094 21872 21780 25095 46589 20896 18450
## [31] 21535 26178 115149 21695 53504 21384 28698 45600 17920 25286
## [41] 27650 24090 25000 1300 21286 21750 29959 18000 23257 17755
## [51] 35760 18030 35133 32463 18890 24682 23595 17871 36500 63887
## [61] 20781 25339 57200 20544 19690 21930 26142
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# global = global + 1
# toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "Street"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Grvl 6
## 2 Pave 1443
Probably will be looking to remove this one. The distribution is too skewed. Variance looks too little.
attrib = attrib + 1
colname[attrib]
## [1] "Alley"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Grvl 50
## 2 None 1359
## 3 Pave 40
Probably will be looking to remove this one. The distribution is too skewed. Variance looks too little.
attrib = attrib + 1
colname[attrib]
## [1] "LotShape"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# Reassigning levels
temp$LotShape <- factor(temp$LotShape)
levels(temp$LotShape) <- list(IR=c("IR1","IR2","IR3"), Reg="Reg")
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# global = global + 1
# toremove[global] = attrib
}
## Var1 Freq
## 1 IR1 482
## 2 IR2 41
## 3 IR3 10
## 4 Reg 916
## Var1 Freq
## 1 IR 533
## 2 Reg 916
After looking at the distribution (variances) in each level and how closely related IR1,IR2,IR3 were to each other, I decided to group them. This will level the distribution a bit.
attrib = attrib + 1
colname[attrib]
## [1] "LandContour"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Bnk 63
## 2 HLS 50
## 3 Low 36
## 4 Lvl 1300
Will be dropped for sure.
attrib = attrib + 1
colname[attrib]
## [1] "Utilities"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 AllPub 1448
## 2 NoSeWa 1
Will be dropped for sure.
attrib = attrib + 1
colname[attrib]
## [1] "LotConfig"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Corner 262
## 2 CulDSac 93
## 3 FR2 47
## 4 FR3 3
## 5 Inside 1044
Will drop.
attrib = attrib + 1
colname[attrib]
## [1] "LandSlope"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Gtl 1371
## 2 Mod 65
## 3 Sev 13
Will drop
attrib = attrib + 1
colname[attrib]
## [1] "Neighborhood"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Blmngtn 17
## 2 Blueste 2
## 3 BrDale 16
## 4 BrkSide 58
## 5 ClearCr 28
## 6 CollgCr 148
## 7 Crawfor 50
## 8 Edwards 100
## 9 Gilbert 78
## 10 IDOTRR 37
## 11 MeadowV 17
## 12 Mitchel 49
## 13 NAmes 225
## 14 NoRidge 41
## 15 NPkVill 9
## 16 NridgHt 75
## 17 NWAmes 73
## 18 OldTown 113
## 19 Sawyer 74
## 20 SawyerW 58
## 21 Somerst 83
## 22 StoneBr 25
## 23 SWISU 25
## 24 Timber 37
## 25 Veenker 11
Will not touch this because i feel it will be very important.
attrib = attrib + 1
colname[attrib]
## [1] "Condition1"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Artery 48
## 2 Feedr 81
## 3 Norm 1249
## 4 PosA 8
## 5 PosN 19
## 6 RRAe 11
## 7 RRAn 26
## 8 RRNe 2
## 9 RRNn 5
Distribution is very bad, will drop
attrib = attrib + 1
colname[attrib]
## [1] "Condition2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Artery 2
## 2 Feedr 6
## 3 Norm 1434
## 4 PosA 1
## 5 PosN 2
## 6 RRAe 1
## 7 RRAn 1
## 8 RRNn 2
Even worse vairance, will drop.
attrib = attrib + 1
colname[attrib]
## [1] "BldgType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 1Fam 1211
## 2 2fmCon 31
## 3 Duplex 52
## 4 Twnhs 43
## 5 TwnhsE 112
I like the idea of this attribute, unfortunatly the vaiance is not within my ruleset
attrib = attrib + 1
colname[attrib]
## [1] "HouseStyle"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 1.5Fin 154
## 2 1.5Unf 14
## 3 1Story 720
## 4 2.5Fin 8
## 5 2.5Unf 11
## 6 2Story 442
## 7 SFoyer 36
## 8 SLvl 64
Although the distribution is not that good, we shall leave it as is because we want to perserve the information it gives. Perhaps later we will remove it.
attrib = attrib + 1
colname[attrib]
## [1] "OverallQual"
if (colname[attrib] == 'OverallQual'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 1 2
## 2 10 17
## 3 2 3
## 4 3 20
## 5 4 116
## 6 5 395
## 7 6 372
## 8 7 314
## 9 8 167
## 10 9 43
if (typeof(temp[,attrib]) == 'integer' & colname[attrib] != 'OverallQual'){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 1 2
## 2 10 17
## 3 2 3
## 4 3 20
## 5 4 116
## 6 5 395
## 7 6 372
## 8 7 314
## 9 8 167
## 10 9 43
There is no reason for us to be chnaging this. Changing to quality ranges would not help the distribution (ex 1-3, 4-7, 8-10). Will keep as is.
attrib = attrib + 1
colname[attrib]
## [1] "OverallCond"
if (colname[attrib] == 'OverallCond'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 1 1
## 2 2 5
## 3 3 24
## 4 4 57
## 5 5 813
## 6 6 251
## 7 7 205
## 8 8 71
## 9 9 22
if (typeof(temp[,attrib]) == 'integer' & colname[attrib] != 'OverallCond'){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 1 1
## 2 2 5
## 3 3 24
## 4 4 57
## 5 5 813
## 6 6 251
## 7 7 205
## 8 8 71
## 9 9 22
Again, the distribution does not look too good towards the extremes, but there is no way to fix this without removing information. Will keep
attrib = attrib + 1
colname[attrib]
## [1] "YearBuilt"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_yearbuilt = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 7
## [1] 1880 1880 1880 1882 1880 1875 1872
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "YearRemodAdd"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
## [1] 0
## numeric(0)
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "RoofStyle"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Flat 13
## 2 Gable 1131
## 3 Gambrel 11
## 4 Hip 285
## 5 Mansard 7
## 6 Shed 2
attrib = attrib + 1
colname[attrib]
## [1] "RoofMatl"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 ClyTile 1
## 2 CompShg 1423
## 3 Membran 1
## 4 Metal 1
## 5 Roll 1
## 6 Tar&Grv 11
## 7 WdShake 5
## 8 WdShngl 6
will remove this this attrib
attrib = attrib + 1
colname[attrib]
## [1] "Exterior1st"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 AsbShng 20
## 2 AsphShn 1
## 3 BrkComm 2
## 4 BrkFace 50
## 5 CBlock 1
## 6 CemntBd 59
## 7 HdBoard 222
## 8 ImStucc 1
## 9 MetalSd 220
## 10 Plywood 108
## 11 Stone 2
## 12 Stucco 25
## 13 VinylSd 508
## 14 Wd Sdng 205
## 15 WdShing 25
attrib = attrib + 1
colname[attrib]
## [1] "Exterior2nd"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 AsbShng 20
## 2 AsphShn 3
## 3 Brk Cmn 7
## 4 BrkFace 25
## 5 CBlock 1
## 6 CmentBd 58
## 7 HdBoard 206
## 8 ImStucc 10
## 9 MetalSd 214
## 10 Other 1
## 11 Plywood 142
## 12 Stone 4
## 13 Stucco 26
## 14 VinylSd 497
## 15 Wd Sdng 197
## 16 Wd Shng 38
attrib = attrib + 1
colname[attrib]
## [1] "MasVnrType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 BrkCmn 15
## 2 BrkFace 445
## 3 None 862
## 4 Stone 127
attrib = attrib + 1
colname[attrib]
## [1] "MasVnrArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_masVnrArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 96
## [1] 640 650 456 1031 573 1115 576 443 468 600 768 480 1129 436
## [15] 456 664 653 491 748 456 922 506 604 472 481 1600 616 870
## [29] 530 500 510 650 432 473 772 435 562 921 762 594 479 584
## [43] 420 459 452 513 472 660 528 464 1170 630 466 651 442 894
## [57] 513 673 603 860 424 1047 442 816 760 541 423 424 975 450
## [71] 423 571 480 425 660 1378 456 425 420 766 554 632 567 451
## [85] 621 788 796 428 564 579 705 731 420 448 426 438
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "ExterQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 51
## 2 Fa 14
## 3 Gd 480
## 4 TA 904
attrib = attrib + 1
colname[attrib]
## [1] "ExterCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 3
## 2 Fa 28
## 3 Gd 145
## 4 Po 1
## 5 TA 1272
attrib = attrib + 1
colname[attrib]
## [1] "Foundation"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 BrkTil 146
## 2 CBlock 633
## 3 PConc 637
## 4 Slab 24
## 5 Stone 6
## 6 Wood 3
attrib = attrib + 1
colname[attrib]
## [1] "BsmtQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 120
## 2 Fa 35
## 3 Gd 609
## 4 NoB 37
## 5 TA 648
attrib = attrib + 1
colname[attrib]
## [1] "BsmtCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Fa 45
## 2 Gd 64
## 3 NoB 37
## 4 Po 2
## 5 TA 1301
Might drop too
attrib = attrib + 1
colname[attrib]
## [1] "BsmtExposure"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Av 219
## 2 Gd 133
## 3 Mn 114
## 4 No 945
## 5 NoB 38
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinType1"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 ALQ 220
## 2 BLQ 148
## 3 GLQ 411
## 4 LwQ 74
## 5 NoB 37
## 6 Rec 132
## 7 Unf 427
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinSF1"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bsmtFinSF1 = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 8
## [1] 1810 1880 1904 1767 2260 2188 2096 5644
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinType2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 ALQ 19
## 2 BLQ 33
## 3 GLQ 14
## 4 LwQ 46
## 5 NoB 38
## 6 Rec 54
## 7 Unf 1245
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinSF2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bsmtFinSF2 = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 167
## [1] 32 668 486 93 491 506 712 362 41 169 869 150 670 28
## [15] 1080 181 768 215 374 208 441 184 279 306 180 712 580 690
## [29] 692 228 125 1063 620 175 820 1474 264 479 147 232 380 544
## [43] 294 258 121 180 391 531 344 539 713 210 311 1120 165 532
## [57] 279 96 495 180 174 1127 139 202 645 123 551 219 606 147
## [71] 612 480 182 132 336 468 287 35 499 180 180 723 119 182
## [85] 40 551 117 239 80 472 64 1057 127 630 480 128 377 764
## [99] 345 539 1085 435 823 500 290 324 634 411 841 1061 93 466
## [113] 396 354 294 149 193 117 273 465 400 468 41 682 64 557
## [127] 230 106 791 240 287 547 391 469 177 108 374 600 492 211
## [141] 168 96 1031 438 375 144 81 906 608 276 661 68 173 972
## [155] 105 420 469 546 334 352 872 374 110 627 163 1029 290
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtUnfSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bsmtUnfSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 29
## [1] 1777 1768 1907 1686 2336 1694 2121 1869 2153 1969 1709 2042 1774 2046
## [15] 1836 1935 1926 1734 1800 1753 1905 1800 1710 1752 1694 1689 2002 1753
## [29] 1795
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "TotalBsmtSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_totalBsmtSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 60
## [1] 0 0 2223 0 0 0 2216 0 2392 0 2121 2136 3206 0
## [15] 0 0 0 3094 2153 3200 0 3138 0 0 0 0 2109 2077
## [29] 2444 0 0 0 0 2078 0 2217 0 0 2330 0 0 0
## [43] 0 2524 0 0 0 0 0 2396 2158 0 0 2136 0 2110
## [57] 6110 0 2633 0
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "Heating"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Floor 1
## 2 GasA 1417
## 3 GasW 18
## 4 Grav 7
## 5 OthW 2
## 6 Wall 4
attrib = attrib + 1
colname[attrib]
## [1] "HeatingQC"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 733
## 2 Fa 49
## 3 Gd 239
## 4 Po 1
## 5 TA 427
attrib = attrib + 1
colname[attrib]
## [1] "CentralAir"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 N 95
## 2 Y 1354
attrib = attrib + 1
colname[attrib]
## [1] "Electrical"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 FuseA 94
## 2 FuseF 27
## 3 FuseP 3
## 4 Mix 1
## 5 SBrkr 1324
attrib = attrib + 1
colname[attrib]
## [1] "X1stFlrSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_x1stFlrSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 19
## [1] 2207 2223 2259 2158 2234 2392 2402 3228 3138 2444 2217 2364 2898 2524
## [15] 2411 2196 4692 2156 2633
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "X2ndFlrSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_x2stFlrSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 2
## [1] 1872 2065
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "LowQualFinSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_lowQualFinSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 26
## [1] 360 513 234 528 572 144 392 371 390 420 473 156 515 360 80 80 53
## [18] 232 481 120 514 397 479 205 80 384
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GrLivArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_grLivArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 31
## [1] 2945 3222 3608 3112 2794 3493 2978 3228 4676 2775 3194 3395 4316 3279
## [15] 3140 2822 2872 2898 3082 2868 2828 3627 3086 2872 4476 3447 5642 2810
## [29] 2792 3238 2784
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFullBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 852
## 2 1 581
## 3 2 15
## 4 3 1
attrib = attrib + 1
colname[attrib]
## [1] "BsmtHalfBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 1368
## 2 1 79
## 3 2 2
attrib = attrib + 1
colname[attrib]
## [1] "FullBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 9
## 2 1 648
## 3 2 760
## 4 3 32
attrib = attrib + 1
colname[attrib]
## [1] "HalfBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 908
## 2 1 529
## 3 2 12
attrib = attrib + 1
colname[attrib]
## [1] "BedroomAbvGr"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 6
## 2 1 49
## 3 2 354
## 4 3 800
## 5 4 211
## 6 5 21
## 7 6 7
## 8 8 1
attrib = attrib + 1
colname[attrib]
## [1] "KitchenAbvGr"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 1
## 2 1 1382
## 3 2 64
## 4 3 2
attrib = attrib + 1
colname[attrib]
## [1] "KitchenQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bedroomAbvGr = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 99
## 2 Fa 39
## 3 Gd 578
## 4 TA 733
attrib = attrib + 1
colname[attrib]
## [1] "TotRmsAbvGrd"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 10 47
## 2 11 18
## 3 12 11
## 4 14 1
## 5 2 1
## 6 3 17
## 7 4 96
## 8 5 273
## 9 6 400
## 10 7 325
## 11 8 186
## 12 9 74
attrib = attrib + 1
colname[attrib]
## [1] "Functional"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Maj1 13
## 2 Maj2 5
## 3 Min1 31
## 4 Min2 34
## 5 Mod 15
## 6 Sev 1
## 7 Typ 1350
attrib = attrib + 1
colname[attrib]
## [1] "Fireplaces"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 684
## 2 1 647
## 3 2 113
## 4 3 5
attrib = attrib + 1
colname[attrib]
## [1] "FireplaceQu"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 24
## 2 Fa 33
## 3 Gd 377
## 4 NoF 684
## 5 Po 20
## 6 TA 311
attrib = attrib + 1
colname[attrib]
## [1] "GarageType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 2Types 6
## 2 Attchd 862
## 3 Basment 19
## 4 BuiltIn 86
## 5 CarPort 9
## 6 Detchd 386
## 7 NoG 81
attrib = attrib + 1
colname[attrib]
## [1] "GarageYrBlt"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 1900 1
## 2 1906 1
## 3 1908 1
## 4 1910 3
## 5 1914 2
## 6 1915 2
## 7 1916 5
## 8 1918 2
## 9 1920 14
## 10 1921 3
## 11 1922 5
## 12 1923 3
## 13 1924 3
## 14 1925 10
## 15 1926 6
## 16 1927 1
## 17 1928 4
## 18 1929 2
## 19 1930 8
## 20 1931 4
## 21 1932 3
## 22 1933 1
## 23 1934 2
## 24 1935 4
## 25 1936 5
## 26 1937 2
## 27 1938 3
## 28 1939 9
## 29 1940 14
## 30 1941 10
## 31 1942 2
## 32 1945 4
## 33 1946 4
## 34 1947 2
## 35 1948 11
## 36 1949 8
## 37 1950 24
## 38 1951 6
## 39 1952 3
## 40 1953 12
## 41 1954 19
## 42 1955 13
## 43 1956 16
## 44 1957 20
## 45 1958 21
## 46 1959 17
## 47 1960 19
## 48 1961 13
## 49 1962 21
## 50 1963 16
## 51 1964 18
## 52 1965 21
## 53 1966 21
## 54 1967 15
## 55 1968 26
## 56 1969 15
## 57 1970 20
## 58 1971 13
## 59 1972 14
## 60 1973 14
## 61 1974 17
## 62 1975 8
## 63 1976 29
## 64 1977 35
## 65 1978 19
## 66 1979 15
## 67 1980 15
## 68 1981 10
## 69 1982 4
## 70 1983 7
## 71 1984 8
## 72 1985 10
## 73 1986 6
## 74 1987 11
## 75 1988 14
## 76 1989 10
## 77 1990 16
## 78 1991 9
## 79 1992 13
## 80 1993 22
## 81 1994 18
## 82 1995 18
## 83 1996 20
## 84 1997 19
## 85 1998 31
## 86 1999 30
## 87 2000 27
## 88 2001 20
## 89 2002 24
## 90 2003 49
## 91 2004 52
## 92 2005 65
## 93 2006 58
## 94 2007 45
## 95 2008 29
## 96 2009 21
## 97 2010 3
## 98 NoG 81
I will not remove this because it shows a pattern.
attrib = attrib + 1
colname[attrib]
## [1] "GarageFinish"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Fin 346
## 2 NoG 81
## 3 RFn 417
## 4 Unf 605
attrib = attrib + 1
colname[attrib]
## [1] "GarageCars"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_garagecars = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 5
## [1] 4 4 4 4 4
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GarageArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_garageArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 20
## [1] 1166 968 1053 1025 1390 1134 983 1020 1220 1248 1043 1052 995 1356
## [15] 1052 954 1014 1418 968 1069
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GarageQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 3
## 2 Fa 48
## 3 Gd 14
## 4 NoG 81
## 5 Po 3
## 6 TA 1300
attrib = attrib + 1
colname[attrib]
## [1] "GarageCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 2
## 2 Fa 35
## 3 Gd 9
## 4 NoG 81
## 5 Po 7
## 6 TA 1315
attrib = attrib + 1
colname[attrib]
## [1] "PavedDrive"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 N 90
## 2 P 30
## 3 Y 1329
attrib = attrib + 1
colname[attrib]
## [1] "WoodDeckSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_woodDeck = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 32
## [1] 857 576 476 574 441 468 670 495 536 519 466 517 426 503 486 486 511
## [18] 421 550 509 474 728 436 431 448 439 635 500 668 586 431 736
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "OpenPorchSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_openPorch = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 75
## [1] 204 213 258 199 234 184 205 228 238 260 198 172 208 228 184 250 175
## [18] 195 214 231 192 187 176 523 285 406 182 502 274 172 243 235 312 267
## [35] 265 288 341 204 174 247 291 312 418 240 364 188 207 234 192 191 252
## [52] 189 282 224 319 244 185 200 180 263 304 234 240 192 198 287 292 207
## [69] 241 547 211 184 262 210 236
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "EnclosedPorch"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
## [1] 207
## [1] 272 228 205 176 205 87 172 102 37 144 64 114 202 128 156 44 77
## [18] 144 192 144 140 180 228 128 183 39 184 40 552 30 126 96 60 150
## [35] 120 202 77 112 252 52 224 234 144 244 268 137 24 108 294 177 218
## [52] 242 91 112 160 130 184 126 169 105 34 96 248 236 120 32 80 115
## [69] 291 184 116 158 112 210 36 156 144 84 148 116 120 136 102 240 54
## [86] 112 39 100 36 189 293 164 40 216 239 112 252 240 180 67 90 120
## [103] 56 112 129 40 98 143 216 234 112 112 70 386 154 185 156 156 134
## [120] 196 264 185 275 96 120 112 116 230 254 68 194 192 34 150 164 112
## [137] 224 32 318 244 48 94 138 108 112 226 192 174 228 19 170 220 128
## [154] 80 115 137 192 252 112 96 176 216 176 214 280 96 116 102 190 236
## [171] 192 84 330 208 145 259 126 264 81 164 42 123 162 100 286 190 168
## [188] 20 301 198 96 221 112 212 50 150 168 112 160 114 216 154 99 158
## [205] 216 252 112
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "X3SsnPorch"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_X3Ss = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 24
## [1] 320 407 130 180 168 180 140 508 238 245 196 144 144 182 168 162 23
## [18] 168 216 96 216 153 290 304
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "ScreenPorch"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_screenPorch = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 116
## [1] 176 198 291 252 99 184 168 130 142 192 410 224 266 170 154 153 144
## [18] 142 128 259 160 198 271 234 184 374 192 185 182 90 144 224 396 170
## [35] 176 140 276 192 180 161 168 145 200 122 95 144 120 60 120 126 189
## [52] 260 147 385 287 200 156 100 180 216 210 197 204 192 225 192 152 175
## [69] 126 312 222 265 224 322 120 190 233 63 147 180 53 143 189 189 189
## [86] 192 160 160 126 100 273 180 90 288 263 224 147 120 80 163 90 288
## [103] 116 259 224 216 480 120 178 440 155 168 220 119 165 40
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "PoolArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_poolarea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 7
## [1] 512 648 576 555 480 519 738
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "PoolQC"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 2
## 2 Fa 2
## 3 Gd 3
## 4 NoP 1442
Tooo big of a variance
attrib = attrib + 1
colname[attrib]
## [1] "Fence"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 GdPrv 59
## 2 GdWo 54
## 3 MnPrv 156
## 4 MnWw 11
## 5 NoF 1169
attrib = attrib + 1
colname[attrib]
## [1] "MiscFeature"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Gar2 2
## 2 NoM 1395
## 3 Othr 2
## 4 Shed 49
## 5 TenC 1
attrib = attrib + 1
colname[attrib]
## [1] "MiscVal"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_miscVal = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 52
## [1] 700 350 700 500 400 700 480 400 400 450 450
## [12] 500 450 700 400 15500 1200 800 480 400 2000 2000
## [23] 600 500 600 600 3500 500 400 450 500 1300 1200
## [34] 500 400 54 500 400 400 2000 620 400 560 500
## [45] 700 1400 400 8300 600 1150 2000 2500
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "MoSold"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 1 58
## 2 10 89
## 3 11 78
## 4 12 58
## 5 2 52
## 6 3 104
## 7 4 140
## 8 5 201
## 9 6 253
## 10 7 233
## 11 8 121
## 12 9 62
attrib = attrib + 1
colname[attrib]
## [1] "YrSold"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 2006 313
## 2 2007 327
## 3 2008 299
## 4 2009 336
## 5 2010 174
attrib = attrib + 1
colname[attrib]
## [1] "SaleType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 COD 43
## 2 Con 2
## 3 ConLD 9
## 4 ConLI 5
## 5 ConLw 5
## 6 CWD 4
## 7 New 119
## 8 Oth 3
## 9 WD 1259
To drop because of variance
attrib = attrib + 1
colname[attrib]
## [1] "SaleCondition"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Abnorml 101
## 2 AdjLand 4
## 3 Alloca 11
## 4 Family 20
## 5 Normal 1191
## 6 Partial 122
Time to make some decisons.
print("The following attributes are under review to be removed:")
## [1] "The following attributes are under review to be removed:"
colname[toremove]
## [1] "MSZoning" "Street" "Alley" "LandContour" "Utilities"
## [6] "LotConfig" "LandSlope" "Condition1" "Condition2" "BldgType"
## [11] "RoofMatl" "BsmtCond" "Heating" "CentralAir" "Electrical"
## [16] "Functional" "GarageQual" "GarageCond" "PavedDrive" "PoolQC"
## [21] "Fence" "MiscFeature" "SaleType"
print("The following attributes have outliers that need to be addressed:")
## [1] "The following attributes have outliers that need to be addressed:"
colname[outers]
## [1] "LotFrontage" "LotArea" "YearBuilt" "MasVnrArea"
## [5] "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF"
## [9] "X1stFlrSF" "X2ndFlrSF" "LowQualFinSF" "GrLivArea"
## [13] "GarageCars" "GarageArea" "WoodDeckSF" "OpenPorchSF"
## [17] "X3SsnPorch" "ScreenPorch" "PoolArea" "MiscVal"
We will be working on checking the vairance of each variable. We want high variance.
#The ruleset I am emplaying is if there is one variable with over 65% of the observations or 2 with over 70%, I shall remove the attribute based of low variance.
temp[toremove] <- NULL
We shall look at the matrix of the numeric attributes and remove any that have higher than 0.75 correlation with the dependent variable
#Subsetting for numeric only
nums <- unlist(lapply(temp, is.numeric))
numONLY = temp[,nums]
aa<-cor(numONLY)
ggcorrplot(aa)
#We are only intrested in the Sales price of the matrix
aa[,14]
## Id LotFrontage LotArea YearBuilt YearRemodAdd
## 0.008942765 0.223665282 0.260984687 0.200101716 0.290000837
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 0.390639498 0.206730870 -0.009306382 0.240050955 0.453943400
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea GarageCars
## 0.565663750 0.690263374 0.135276720 1.000000000 0.466809564
## GarageArea WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0.469419238 0.249199224 0.330309795 0.005335120 0.020858798
## ScreenPorch PoolArea MiscVal SalePrice
## 0.102209377 0.170843563 -0.002311848 0.710191940
There are some conclusions we can make.
#Might be better for us to remove 4 attributes and create a fuller 1 attribute
temp$HouseArea <- temp$TotalBsmtSF + temp$X1stFlrSF + temp$X2ndFlrSF
# Undo some work from before for this stage.
temp$FullBath <- as.numeric(temp$FullBath)
temp$BsmtFullBath <- as.numeric(temp$BsmtFullBath)
temp$HalfBath <- as.numeric(temp$HalfBath)
temp$BsmtHalfBath <- as.numeric(temp$BsmtHalfBath)
# Why keep so many bathroom attributes. Let us condense instead.
temp$TotalBath <- temp$FullBath + 0.5*(temp$HalfBath) + temp$BsmtFullBath + 0.5*as.numeric(temp$BsmtHalfBath)
# Same with porch
temp$TotalPorchSF <- temp$OpenPorchSF + temp$EnclosedPorch + temp$X3SsnPorch + temp$ScreenPorch + temp$WoodDeckSF
# Lets include the garage now
temp$TotalArea <-temp$HouseArea + temp$GarageArea
#We also know that total basement SF = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF. WHy not remove it
#Another reason so remove total basement SF is because of its high correlation to 1stFlrSF
temp$TotalBsmtSF <- NULL
#Do not need house area, if we have total area
temp$HouseArea <- NULL
We will remove OverallQual and GrLiveArea due to its high correlation. (above our 0.7 threshold) GarageCars and Garagearea also have high correlation, and they themselves are highly correlated (0.88), so I shall remove one because they pretty much provide the same amount of information to the dependent variable. I pick Garagearea to remove because there are already a lot of area variables.
Also lets remove the attributes used in the addition, as well.
temp$OverallQual <- NULL
temp$GrLivArea <- NULL
temp$GarageArea <- NULL
temp$X1stFlrSF <-NULL
temp$X2ndFlrSF <- NULL
temp$FullBath <- NULL
temp$HalfBath <-NULL
temp$BsmtFullBath <- NULL
temp$BsmtHalfBath <- NULL
temp$OpenPorchSF <- NULL
temp$EnclosedPorch <- NULL
temp$X3SsnPorch <- NULL
temp$ScreenPorch <- NULL
temp$WoodDeckSF <- NULL
temp$PoolArea <- NULL
After our reduction, lets see our variables
colname_new <- colnames(temp)
colname_new
## [1] "Id" "MSSubClass" "LotFrontage" "LotArea"
## [5] "LotShape" "Neighborhood" "HouseStyle" "OverallCond"
## [9] "YearBuilt" "YearRemodAdd" "RoofStyle" "Exterior1st"
## [13] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [17] "ExterCond" "Foundation" "BsmtQual" "BsmtExposure"
## [21] "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2" "BsmtFinSF2"
## [25] "BsmtUnfSF" "HeatingQC" "LowQualFinSF" "BedroomAbvGr"
## [29] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Fireplaces"
## [33] "FireplaceQu" "GarageType" "GarageYrBlt" "GarageFinish"
## [37] "GarageCars" "MiscVal" "MoSold" "YrSold"
## [41] "SaleCondition" "SalePrice" "TotalBath" "TotalPorchSF"
## [45] "TotalArea"
We removed 25 variables so far, and added 3
#Only intrested in the attributes we added, but lets take a look
nums <- unlist(lapply(temp, is.numeric))
numONLY = temp[,nums]
aa<-cor(numONLY)
ggcorrplot(aa)
#We are only intrested in the Sales price of the matrix
aa[,15]
## Id LotFrontage LotArea YearBuilt YearRemodAdd
## -0.030788859 0.039623156 0.185331915 0.097517499 0.180721296
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF LowQualFinSF
## 0.163072782 0.195781596 0.096143692 0.049431460 0.019967892
## GarageCars MiscVal SalePrice TotalBath TotalPorchSF
## 0.236456698 0.003100884 0.389503241 0.313955559 1.000000000
## TotalArea
## 0.397873047
diag(var(numONLY))
## Id LotFrontage LotArea YearBuilt YearRemodAdd
## 1.779660e+05 1.202679e+03 9.998646e+07 9.119621e+02 4.265806e+02
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF LowQualFinSF
## 3.283680e+04 2.075615e+05 2.620502e+04 1.956699e+05 2.381903e+03
## GarageCars MiscVal SalePrice TotalBath TotalPorchSF
## 5.609260e-01 2.479934e+05 6.297114e+09 6.144431e-01 2.461454e+04
## TotalArea
## 9.187692e+05
As assumed, the areas will be correlated with one another, highly. Also the sales price is largely correlated with areas, bathrooms. I know that these areas are highly correlated but because we added so many variables into it, I will not be removign them.
Finally, we will look at addressing the outliers. I was holding off for as long as I could because they may potentially be eliminated with the removal of some of the attributes, but we might be at a point where no more attributes are to be taken out.
Lets look for some patterns. A lot of it was discovered from the correlation table, but lets reillustrate some of it here. Truth be told, in our univariant work, we looked at some intresting combinations with the Sales price. I will re highlight these and look at some other combinations.
#Lets look at Lot
plot(temp$LotFrontage, temp$LotArea)
ggplot(data = temp, aes(y=temp$LotFrontage, x=temp$LotShape)) + geom_jitter(aes((temp$LotShape) ))
Nothing meaningful
plot(temp$TotalBath,temp$SalePrice)
plot(temp$TotalArea,temp$SalePrice)
plot(temp$GarageCars, temp$SalePrice)
plot(temp$YearBuilt, temp$SalePrice)
plot(temp$YearRemodAdd, temp$SalePrice)
#These attributes had the highest correlation with sales price. It is evident in these plot.
#Something else we can see from the last prompts is that larger homes would have more bathrooms, and in turn higher prices.
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$TotalBath)
#Larger homes would also have better quality fireplaces.
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$FireplaceQu)
#And better exterior quality
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$ExterQual)
#And better Basement quality
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$BsmtQual)
#QUality in general makes for a better house price
qplot(temp$ExterQual,temp$SalePrice, data = temp, colour= temp$BsmtQual)
# Very imporant comparisons, used to make decisions. Great to find patterns
ggplot(data = temp, aes(x=temp$ExterQual, y=temp$SalePrice)) + geom_count(aes( factor(temp$ExterQual) ))
ggplot(data = temp, aes(x=temp$KitchenQual, y=temp$SalePrice)) + geom_count(aes(factor(temp$KitchenQual) ))
ggplot(data = temp, aes(x=temp$BsmtQual, y=temp$SalePrice)) + geom_count(aes(factor(temp$BsmtQual) ))
ggplot(data = temp, aes(x=temp$FireplaceQu, y=temp$SalePrice)) + geom_count(aes(factor(temp$FireplaceQu) ))
#ggplot(data = temp, aes(x=temp$Fireplace, y=temp$SalePrice)) + geom_count(aes((temp$Fireplace) ))
qplot(temp$Fireplaces,temp$SalePrice, data = temp)
ggplot(data = temp, aes(x=temp$MiscVal, y=temp$SalePrice)) + geom_count(aes((temp$MiscVal) ))
ggplot(data = temp, aes(x=temp$BsmtFinSF1, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinSF1) ))
ggplot(data = temp, aes(x=temp$BsmtFinSF2, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinSF2) ))
ggplot(data = temp, aes(x=temp$BsmtFinType2, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinType2) )) #This got through the cracks, it should have been removed.
ggplot(data = temp, aes(x=temp$MasVnrArea, y=temp$SalePrice)) + geom_count(aes((temp$MasVnrArea) ))
ggplot(data = temp, aes(x=temp$MasVnrType, y=temp$SalePrice)) + geom_count(aes((temp$MasVnrType) ))
ggplot(data = temp, aes(x=temp$BsmtUnfSF, y=temp$SalePrice)) + geom_count(aes((temp$BsmtUnfSF) ))
mean(temp$BsmtUnfSF)
## [1] 567.4651
nrow(temp[temp$BsmtUnfSF<200,])#I decide to keep this, attribute
## [1] 335
ggplot(data = temp, aes(x=temp$LowQualFinSF, y=temp$SalePrice)) + geom_count(aes((temp$LowQualFinSF) ))
ggplot(data = temp, aes(x=temp$GarageCars, y=temp$SalePrice)) + geom_count(aes((temp$GarageCars) ))
ggplot(data = temp, aes(x=temp$LotFrontage, y=temp$SalePrice)) + geom_count(aes((temp$LotFrontage) ))
nrow(temp[temp$LotFrontage<10,])#I decide to keep this, attribute
## [1] 257
ggplot(data = temp, aes(x=temp$tot, y=temp$SalePrice)) + geom_count(aes((temp$LotFrontage) ))
mean(temp$PoolArea)
## Warning in mean.default(temp$PoolArea): argument is not numeric or logical:
## returning NA
## [1] NA
mean(temp$MiscVal)
## [1] 43.81919
mean(as.numeric(temp$Fireplaces))
## [1] 0.6128364
#After observing the graphics and looking into variances and mean, these need to be removed. The means alone tell a story of how skewed of a picture these give. Because of a lack of normalization, means can show a picture as well.
temp$Fireplace <- NULL
temp$PoolArea <- NULL
temp$MiscVal <- NULL
temp$BsmtFinSF2 <- NULL
temp$MasVnrArea <- NULL
temp$LowQualFinSF <-NULL
I saved this for as late as possible. I wanted to see if most of the attributes will be eliminated before we remove observations due to outliers.
#bsmtfinsf1
flattened_outlier = unlist(outlier_bsmtFinSF1[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#removed bsmtfinsf2
outlier_bsmtFinSF2 = 0
#removed masvnrarea
outlier_masVnrArea = 0
#bsmtUnfSF
flattened_outlier = unlist(outlier_bsmtUnfSF[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#removed garage area
outlier_garageArea = 0
#removed garage car
flattened_outlier = unlist(outlier_garagecars[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#removed grlivarea
outlier_grLivArea = 0
#lotarea
flattened_outlier = unlist(outlier_lotArea[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#lotfrontage
flattened_outlier = unlist(outlier_lotFrontage[1], use.names = FALSE)
temp = subset(temp, !(temp$Id %in% flattened_outlier))
#removed lowqualfin
outlier_lowQualFinSF = 0
#removed misc
outlier_miscVal = 0
#removed OpenPorch
outlier_openPorch = 0
#removed pool
outlier_poolarea = 0
#removed screen porch
outlier_screenPorch= 0
#removed total Basement SF
outlier_totalBsmtSF= 0
#removed wood deck
outlier_woodDeck= 0
#removed 1st flr sf
outlier_x1stFlrSF= 0
#removed 2nd flr sf
outlier_x2stFlrSF= 0
#removed 3Ss
outlier_X3Ss= 0
print("it is a good idea we waited to remove the outliers, otherwise we would have lost a lot of observations only to remove the attributes later")
## [1] "it is a good idea we waited to remove the outliers, otherwise we would have lost a lot of observations only to remove the attributes later"
#I will allow this outliers:
#outlier_yearbuilt
#after all our intial analysis, we can set the data back to our original name
modified_data = temp
modified_data = subset(modified_data, select=-c(SalePrice))
modified_data = subset(modified_data, select=-c(YearRemodAdd))
modified_data$SalePrice = temp$SalePrice
modified_data <- droplevels(modified_data)
#we do not need an id anymore
modified_data = subset(modified_data, select=-c(Id))
# Our Normalizing technique
normalize <- function(x) {
if (is.numeric(x)){
return ((x - min(x)) / (max(x) - min(x))) }
else{
return (x)
}
}
data_norm = as.data.frame(lapply(modified_data[1:38], normalize))
data_norm <- cbind(modified_data$SalePrice, data_norm)
colnames(data_norm)[colnames(data_norm)=="modified_data$SalePrice"] <- "SalePrice"
data_norm_noout <- subset(data_norm, data_norm$SalePrice < boxplot(data_norm$SalePrice)$stats[5,])
numsonly <- unlist(lapply(modified_data, is.numeric))
numarray = modified_data[,numsonly]
#numarray = subset(numarray, select = -c(Id))
fit = kmeans(numarray,4)
plotcluster(numarray,fit$cluster)
#str(fit)
fit = kmodes(numarray, 4)
## Warning in kmodes(numarray, 4): data has numeric coloumns with more than 30
## different levels!
plotcluster(numarray,fit$cluster)
# It is not meaningful to have clustering for categorical variables. I did only numerical values.
In fact, these clusters are not meaningful for the numerical only attributes, either! It would have been a better tool to be used in EDA of 2-5 variables perhaps, but I will do without it.
set.seed(11)
#Splitting training to 80%, test to 20%
index <- sample(1:nrow(data_norm_noout), 0.80 *nrow(data_norm_noout))
data_train <- data_norm[index,]
data_test <- data_norm[-index,]
data_train_noout <- data_norm_noout[index,]
data_test_noout <- data_norm_noout[-index,]
rf_data = modified_data
rf_data$GarageYrBlt = unlist(lapply(modified_data$GarageYrBlt, as.numeric), use.name = FALSE )
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
## Warning in lapply(modified_data$GarageYrBlt, as.numeric): NAs introduced by
## coercion
rf_data = subset(rf_data, select= -c(GarageYrBlt))
rdata_norm = as.data.frame(lapply(rf_data[1:38], normalize))
rdata_norm <- cbind(modified_data$SalePrice, rdata_norm)
colnames(rdata_norm)[colnames(rdata_norm)=="modified_data$SalePrice"] <- "SalePrice"
# training to 80%, test to 20%
index <- sample(1:nrow(rdata_norm), 0.80 *nrow(rdata_norm))
rdata_train <- rdata_norm[index,]
rdata_test <- rdata_norm[-index,]
rdata_norm_noout <- subset(rdata_norm, rdata_norm$SalePrice < boxplot(rdata_norm$SalePrice)$stats[5,])
# training to 80%, test to 20%
index <- sample(1:nrow(rdata_norm_noout), 0.80 *nrow(rdata_norm_noout))
rdata_train_noout <- rdata_norm_noout[index,]
rdata_test_noout <- rdata_norm_noout[-index,]
MAE <- function(actual, predicted){
mean(abs(actual- predicted))
}
RMSE <- function(actual, predicted){
sqrt(mean((predicted-actual)^2))
}
#Our label is the Sales price, in col 1
# trainlabel <- data_train[,1]
# testlabel <- data_test[,1]
#Applying KNN
##test_pred <- knn(train = data_train[,2:39], test = data_test[,2:39],cl = data_train[,1], k=9)
#Creating accuracy matrix
##CrossTable(x=testlabel, y=test_pred, prop.chisq=FALSE) # This makes no sense for non-class prediction (i.e Regression)
Learned that KNN might not be a good suit for someone with categorical data within the DF. We shall need to look at regression. shi#3.2.1 Decision Tree 1.
set.seed(11)
tree_model <- tree(rdata_train$SalePrice ~ . , data = rdata_train)
plot(tree_model)
text(tree_model)
tree_pred = predict(tree_model, rdata_test)
MAE(rdata_test$SalePrice, tree_pred)
## [1] 23439.04
RMSE(rdata_test$SalePrice, tree_pred)
## [1] 33673.1
cv_tree = cv.tree(tree_model)
names(cv_tree)
## [1] "size" "dev" "k" "method"
plot (cv_tree$size, cv_tree$dev, type = "b", xlab = "Tree Size", ylab = "MSE")
cv_tree$size[which.min(cv_tree$dev)]
## [1] 10
print("No need to prune, we are using size 9 tree")
## [1] "No need to prune, we are using size 9 tree"
set.seed(11)
m1 <- rpart(data_train$SalePrice ~ ., data = data_train, method = "anova")
m1
## n= 1024
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1024 4.722550e+12 173891.20
## 2) TotalArea< 0.5482043 778 1.239122e+12 146996.20
## 4) TotalArea< 0.389993 365 2.751843e+11 119109.70
## 8) TotalArea< 0.2779812 87 3.744565e+10 91489.78 *
## 9) TotalArea>=0.2779812 278 1.506000e+11 127753.30 *
## 5) TotalArea>=0.389993 413 4.292363e+11 171641.70
## 10) Neighborhood=Blueste,BrkSide,Edwards,IDOTRR,MeadowV,Mitchel,NAmes,NPkVill,NWAmes,OldTown,Sawyer,SWISU 219 1.256293e+11 152961.30 *
## 11) Neighborhood=Blmngtn,ClearCr,CollgCr,Crawfor,Gilbert,NoRidge,NridgHt,SawyerW,Somerst,StoneBr,Timber,Veenker 194 1.409153e+11 192729.30 *
## 3) TotalArea>=0.5482043 246 1.140887e+12 258949.30
## 6) TotalArea< 0.6956939 188 5.297778e+11 235240.90
## 12) GarageYrBlt=1920,1945,1948,1949,1955,1956,1960,1963,1968,1969,1970,1973,1976,1977,1980,1981,1984,1985,1986,1987,1992,NoG 40 7.571870e+10 175180.10 *
## 13) GarageYrBlt=1926,1932,1935,1950,1953,1957,1959,1961,1974,1983,1988,1989,1990,1991,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009 148 2.707692e+11 251473.60
## 26) BsmtQual=Gd,TA 120 1.216738e+11 239645.30 *
## 27) BsmtQual=Ex 28 6.035381e+10 302166.20 *
## 7) TotalArea>=0.6956939 58 1.629136e+11 335797.10
## 14) GarageYrBlt=1959,1968,1976,1977,1981,1988,1990,1992,1993,1994,1995,1996,1997,1998,1999,2000,2003,2004,2007 42 6.844690e+10 315623.00 *
## 15) GarageYrBlt=1934,1982,2001,2005,2006,2008,2009,2010 16 3.250177e+10 388754.10 *
regress_plot <- rpart.plot(m1, type=2, digits=3, fallen.leaves = TRUE)
p1 <- predict(m1, data_test)
# Two error checking methods.
MAE(data_test$SalePrice, p1)
## [1] 23574.07
RMSE(data_test$SalePrice, p1)
## [1] 33262.01
set.seed(11)
model_RF<- randomForest(rdata_train$SalePrice ~ ., data = rdata_train, proximity=TRUE)
model_RF
##
## Call:
## randomForest(formula = rdata_train$SalePrice ~ ., data = rdata_train, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 12
##
## Mean of squared residuals: 527615323
## % Var explained: 88.29
rf_predict = predict(model_RF,rdata_test)
RMSE(rdata_test$SalePrice,rf_predict)
## [1] 22934.38
MAE(rdata_test$SalePrice,rf_predict)
## [1] 14421.09
# Increasing the tree coutn doesnt significantly decrease the errors
## ## PROBLEM HERE BECAUSE THERE ARE CASES WHERE FACTORS IN TEST ARE NOT PRESENT IN TRAINING
set.seed(11)
rregressive_model <- lm(rdata_train$SalePrice ~ ., data = rdata_train)
plot(rregressive_model)
## Warning: not plotting observations with leverage one:
## 51, 158, 184, 253, 572, 592, 611, 652, 762, 822, 1028
## Warning: not plotting observations with leverage one:
## 51, 158, 184, 253, 572, 592, 611, 652, 762, 822, 1028
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
# Residual vs Fitted: No real pattern between Residuals and Fitted values. THis is good. Residuals arefactors our LR did not consider. We dont want those to have
# patterns. However, the increasing slop after the 250 000 mark is concerning.
# QQ-plot: Not completly m=1 slope: not perfectly linear. Also the tails suggest that our model is light tailed, Still could be considered linear
# Scale_Location: Here we clearly see the most of the data follow an non-linear form. Increasing
# FInally the Residual vs Leverage plot tells me there is not a highly influensial oberservation overly skewing our predictor. WHich is good
set.seed(11)
#both reduce vairance
#if you know your variables all very usefuls, use ridge
# Ridge alpha = 0. you are desensitizing your model to the training data. Good to combat overfitting issue of best least squares line
alpha0.fit <- cv.glmnet(x = data.matrix(data_train[,2:39]), y = data_train[,1], type.measure = "mse", alpha = 0, family= "gaussian")
alpha0.predicted <- predict(alpha0.fit, s=alpha0.fit$lambda.1se, newx= data.matrix(data_test[,2:39]))
# Lasso alpha = 1. similar equations, BUTT helps remove useless variables
alpha1.fit <- cv.glmnet(x = data.matrix(data_train[,2:39]), y = data_train[,1], type.measure = "mse", alpha = 1, family= "gaussian")
alpha1.predicted <- predict(alpha1.fit, s=alpha1.fit$lambda.1se, newx= data.matrix(data_test[,2:39]))
MAE(data_test[,1],alpha0.predicted)
## [1] 18438.22
RMSE(data_test[,1],alpha0.predicted)
## [1] 25375.74
MAE(data_test[,1],alpha1.predicted)
## [1] 17607.39
RMSE(data_test[,1],alpha1.predicted)
## [1] 24931.68
# So we can see the differences in error between the weights of Ridge and Lasso
# We will create a Loop to find the best alpha value!
list.of.fits <- list()
for (i in 0:10) {
fit.name <- paste0("alpha", i/10) #create name alpha with the alpha value as loop increments
list.of.fits[[fit.name]] <- cv.glmnet(x = data.matrix(data_train[,2:39]), y = data_train[,1], type.measure="mae", alpha=i/10, family="gaussian")
}
results <- data.frame() # Database for the resulting error matrix
for (i in 0:10) {
fit.name <- paste0("alpha", i/10)
## Use each model to predict 'y' given the Testing dataset
predicted <- predict(list.of.fits[[fit.name]], s=list.of.fits[[fit.name]]$lambda.1se, newx=data.matrix(data_test[,2:39]))
## Error
RMSE = sqrt(mean((predicted-data_test[,1])^2))
## Results
temp <- data.frame(alpha=i/10, RMSE=RMSE, fit.name=fit.name)
results <- rbind(results, temp)
}
results
# We favour more of the lasso technique with an alpha of 0.6
# Lasso alpha = 1. similar equations, BUTT helps remove useless variables
ENR <- cv.glmnet(x = data.matrix(data_train[,2:39]), y = data_train[,1], type.measure = "mse", alpha = 0.6, family= "gaussian")
ENR.predicted <- predict(ENR, s=ENR$lambda.1se, newx= data.matrix(data_test[,2:39]))
MAE <- function(actual, predicted){
mean(abs(actual- predicted))
}
RMSE <- function(actual, predicted){
sqrt(mean((predicted-actual)^2))
}
print("For Regression Models RMSE and MAE are commonly used when comparing regression models")
## [1] "For Regression Models RMSE and MAE are commonly used when comparing regression models"
temp_DT1 <- data.frame(Model="Decision Tree 1", RMSE=RMSE(rdata_test$SalePrice, tree_pred), MAE= MAE(rdata_test$SalePrice, tree_pred))
temp_DT2 <- data.frame(Model="Decision Tree 2", RMSE=RMSE(data_test$SalePrice, p1), MAE= MAE(data_test$SalePrice, p1))
temp_RF <- data.frame(Model="Random Forest", RMSE=RMSE(rdata_test$SalePrice,rf_predict), MAE= MAE(rdata_test$SalePrice,rf_predict))
temp_ENReg <- data.frame(Model="Elastic Net Regression", RMSE=RMSE(data_test$SalePrice,ENR.predicted), MAE= MAE(data_test$SalePrice,ENR.predicted))
err_results <- rbind(temp_DT1, temp_DT2, temp_RF, temp_ENReg)
err_results
MAE <- function(actual, predicted){
mean(abs(actual- predicted))
}
RMSE <- function(actual, predicted){
sqrt(mean((predicted-actual)^2))
}
noout_DT1 <- tree(rdata_train_noout$SalePrice ~ . , data = rdata_train_noout)
DT1_pred <- predict(noout_DT1, rdata_test_noout)
# Removed 2nd type of DT because we saw that it performed worse than the other DT.
noout_RF<- randomForest(rdata_train_noout$SalePrice ~ ., data = rdata_train_noout, proximity=TRUE)
rf_pred = predict(noout_RF,rdata_test_noout)
noout_ENR <- cv.glmnet(x = data.matrix(data_train_noout[,2:39]), y = data_train_noout[,1], type.measure = "mse", alpha = 0.6, family= "gaussian")
END_pred <- predict(noout_ENR, s=noout_ENR$lambda.1se, newx= data.matrix(data_test_noout[,2:39]))
DT1 <- data.frame(Model="Decision Tree", RMSE=RMSE(rdata_test_noout$SalePrice, DT1_pred), MAE= MAE(rdata_test_noout$SalePrice, DT1_pred))
RF <- data.frame(Model="Random Forest", RMSE=RMSE(rdata_test_noout$SalePrice,rf_pred), MAE= MAE(rdata_test_noout$SalePrice,rf_pred))
ENReg <- data.frame(Model="Elastic Net Regression", RMSE=RMSE(data_test_noout$SalePrice,END_pred), MAE= MAE(data_test_noout$SalePrice,END_pred))
error_results <- rbind(DT1,RF,ENReg)
error_results
#Looking into the new data without outliers
outregressive_model <- lm(rdata_train_noout$SalePrice ~ ., data = rdata_train_noout)
plot(outregressive_model)
## Warning: not plotting observations with leverage one:
## 107, 146, 236, 283, 343, 406, 416, 450, 484, 608, 656, 705, 707, 751, 780, 876, 971, 997
## Warning: not plotting observations with leverage one:
## 107, 146, 236, 283, 343, 406, 416, 450, 484, 608, 656, 705, 707, 751, 780, 876, 971, 997
# Residual vs Fitted: There is a pattern between Residuals and Fitted values. THis is not good. this should be random. BAD
# QQ-plot: We wanted this to be
# Scale_Location: Here we clearly see the most of the data follow an non-linear form. Increasing
# FInally the Residual vs Leverage plot tells me there is not a highly influensial oberservation overly skewing our predictor. WHich is good
It is evident that removing those outliers definetly helped imporve our Regression Tree and Regression Forest, however our Elastic net regression did not chnage too much.